{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AIzaSyAZVBIe2yMB7TckGFRBvnoumMX_zMtM6LU\n",
      "cloud-training-demos\n"
     ]
    }
   ],
   "source": [
    "APIKEY=\"AIzaSyAZVBIe2yMB7TckGFRBvnoumMX_zMtM6LU\"   # CHANGE\n",
    "print APIKEY\n",
    "\n",
    "PROJECT_ID = \"cloud-training-demos\"  # CHANGE\n",
    "print PROJECT_ID \n",
    "\n",
    "BUCKET = \"cloud-training-demos-ml\"   # CHANGE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Export the bucket and project names as environment variables.\n",
    "os.environ.update({'BUCKET': BUCKET, 'PROJECT': PROJECT_ID})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2> Finding specific text in a corpus of scanned documents </h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from googleapiclient.discovery import build"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['gs://cloud-training-demos-ml/unstructured/photos/snapshot1.png', 'gs://cloud-training-demos-ml/unstructured/photos/snapshot2.png', 'gs://cloud-training-demos-ml/unstructured/photos/snapshot3.png', 'gs://cloud-training-demos-ml/unstructured/photos/snapshot4.png', 'gs://cloud-training-demos-ml/unstructured/photos/snapshot5.png', 'gs://cloud-training-demos-ml/unstructured/photos/snapshot6.png', 'gs://cloud-training-demos-ml/unstructured/photos/snapshot7.png', 'gs://cloud-training-demos-ml/unstructured/photos/snapshot8.png']\n"
     ]
    }
   ],
   "source": [
    "import subprocess\n",
    "\n",
    "# universal_newlines=True makes check_output return text rather than bytes, so the\n",
    "# listing can be split into lines on both Python 2 and Python 3.\n",
    "listing = subprocess.check_output(\n",
    "    [\"gcloud\", \"storage\", \"ls\", \"gs://{}/unstructured/photos\".format(BUCKET)],\n",
    "    universal_newlines=True)\n",
    "# One object URL per line; drop empty lines.\n",
    "images = [line for line in listing.splitlines() if line]\n",
    "print(images)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are a few of the images we are going to search.\n",
    "\n",
    "<img src=\"https://storage.googleapis.com/cloud-training-demos-ml/unstructured/photos/snapshot1.png\" />\n",
    "<img src=\"https://storage.googleapis.com/cloud-training-demos-ml/unstructured/photos/snapshot2.png\" />\n",
    "<img src=\"https://storage.googleapis.com/cloud-training-demos-ml/unstructured/photos/snapshot5.png\" />"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot1.png\n",
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot2.png\n",
      "image=gs://cloud-training-demos-ml/unstructured/photos/snapshot2.png contains the following text: ARR DELAY\n",
      "DEP DELAY\n",
      "count 45792.000000 46057.000000\n",
      "mean 45.797650 50.822068\n",
      "std 62.863612 61.079590\n",
      "min -46.000000 10.000000\n",
      "25%\n",
      "11.000000\n",
      "17.000000\n",
      "5096 27000000 30.000000\n",
      "75% 59000000 60.000000\n",
      "max 1321.000000 1330.000000\n",
      "\n",
      "image=gs://cloud-training-demos-ml/unstructured/photos/snapshot2.png contains the following text: 1321.000000\n",
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot3.png\n",
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot4.png\n",
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot5.png\n",
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot6.png\n",
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot7.png\n",
      "gs://cloud-training-demos-ml/unstructured/photos/snapshot8.png\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Running Vision API to find images that have a specific search term\n",
    "import base64\n",
    "\n",
    "SEARCH_TERM = u\"1321\"\n",
    "\n",
    "# Build the Vision client once; it does not depend on the image, so there is no\n",
    "# reason to construct it again on every loop iteration.\n",
    "vservice = build('vision', 'v1', developerKey=APIKEY)\n",
    "\n",
    "for IMAGE in images:\n",
    "  print(IMAGE)\n",
    "  # Ask for text detection on the image stored at the given Cloud Storage URL.\n",
    "  request = vservice.images().annotate(body={\n",
    "      'requests': [{\n",
    "          'image': {\n",
    "              'source': {\n",
    "                  'gcs_image_uri': IMAGE\n",
    "              }\n",
    "          },\n",
    "          'features': [{\n",
    "              'type': 'TEXT_DETECTION',\n",
    "              'maxResults': 100,\n",
    "          }]\n",
    "      }],\n",
    "  })\n",
    "  outputs = request.execute(num_retries=3)\n",
    "  # A missing or empty 'responses' list, or a response without 'textAnnotations',\n",
    "  # simply yields no matches for this image.\n",
    "  responses = outputs.get('responses') or [{}]\n",
    "  for output in responses[0].get('textAnnotations', []):\n",
    "    if SEARCH_TERM in output['description']:\n",
    "      print(\"image={} contains the following text: {}\".format(IMAGE, output['description']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2> Translating a large document in parallel. </h2>\n",
    "\n",
    "As the number of items increases, we need to parallelize the API calls. Here, we translate Alice in Wonderland into Spanish.\n",
    "\n",
    "<p>\n",
    "\n",
    "This cell creates a worker that calls the Translate API.  This has to be done on each Spark worker; this is why the import is within the function.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def executeTranslate(inputs):\n",
    "  \"\"\"Translate English text to Spanish with the Translate API (v2).\n",
    "\n",
    "  The import lives inside the function so that it is executed on whichever\n",
    "  Spark worker runs the function.\n",
    "\n",
    "  Args:\n",
    "    inputs: the text to translate; sent as the `q` parameter of the request.\n",
    "\n",
    "  Returns:\n",
    "    The `translatedText` of the first entry in the response's `translations` list.\n",
    "  \"\"\"\n",
    "  from googleapiclient.discovery import build\n",
    "  service = build('translate', 'v2', developerKey=APIKEY)\n",
    "  translator = service.translations()\n",
    "  outputs = translator.list(source='en', target='es', q=inputs).execute()\n",
    "  return outputs['translations'][0]['translatedText']\n",
    "\n",
    "print(\"Added executeTranslate() function.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we are ready to execute the above function in parallel on the Spark cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "alice = sc.textFile(\"gs://cpb103-public-files/alice-short-transformed.txt\")\n",
    "# flatMap (not map) so that every sentence becomes its own RDD element; with map each\n",
    "# element is a whole list of sentences and executeTranslate() returns only the first translation.\n",
    "alice = alice.flatMap(lambda line: line.split(\".\"))\n",
    "# Skip blank fragments, such as the empty string left after a trailing period.\n",
    "alice = alice.filter(lambda sentence: sentence.strip())\n",
    "\n",
    "for eachSentence in alice.take(10):\n",
    "  print(\"{0}\".format(eachSentence))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Note: The book has been preprocessed so that all the newlines have been removed.  This allows the book to be imported as one long\n",
    "string.  The text is then split on the periods to create an array of strings.  The loop just shows the input.\n",
    "\n",
    "<p>\n",
    "\n",
    "This code runs the translation in parallel on the Spark cluster and shows the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Apply the translation function to every element of the RDD.\n",
    "aliceTranslated = alice.map(executeTranslate)\n",
    "\n",
    "# Show the first ten translated results.\n",
    "for translated_sentence in aliceTranslated.take(10):\n",
    "    print(\"{0}\".format(translated_sentence))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2> Sentiment analysis in parallel </h2>\n",
    "\n",
    "Here, we do sentiment analysis on a bunch of text in parallel.  This is similar to the translation example above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Added executeSentimentAnalysis() function.\n"
     ]
    }
   ],
   "source": [
    "def executeSentimentAnalysis(quote):\n",
    "    \"\"\"Run sentiment analysis on a piece of plain text with the Natural Language API.\n",
    "\n",
    "    Uses the generally available v1 API rather than the pre-release v1beta1 version.\n",
    "    The import lives inside the function so that it is executed on whichever\n",
    "    Spark worker runs the function.\n",
    "\n",
    "    Args:\n",
    "        quote: the text to analyze; sent as a PLAIN_TEXT document.\n",
    "\n",
    "    Returns:\n",
    "        The API response as returned by execute(); callers in this notebook read\n",
    "        response['documentSentiment']['score'] and ['magnitude'].\n",
    "    \"\"\"\n",
    "    from googleapiclient.discovery import build\n",
    "    lservice = build('language', 'v1', developerKey=APIKEY)\n",
    "\n",
    "    response = lservice.documents().analyzeSentiment(\n",
    "        body={\n",
    "            'document': {\n",
    "                'type': 'PLAIN_TEXT',\n",
    "                'content': quote\n",
    "            }\n",
    "        }).execute()\n",
    "\n",
    "    return response\n",
    "\n",
    "print(\"Added executeSentimentAnalysis() function.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But this time, instead of processing a text file, let's process data from BigQuery.  We will pull articles on JavaScript from Hacker News and look at the sentiment associated with them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Imports run.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from pandas.io import gbq\n",
    "\n",
    "print(\"Imports run.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running query...\n",
      "Requesting query... ok.\n",
      "Query running...\n",
      "Query done.\n",
      "Cache hit.\n",
      "\n",
      "Retrieving results...\n",
      "Got 10 rows.\n",
      "\n",
      "Total time taken 0.83 s.\n",
      "Finished at 2017-02-12 22:54:45.\n",
      "[Row(title=u'Lead JavaScript for innovative intelligent learning platform at ExpertKnowledge', text=u'About Us\\nAt ExpertKnowledge, we provide automation platform to extract wisdom and transform it into high-quality online learning courses that are certified and delivered on any device.\\nOur team is looking to add an experienced Front End Engineer to lead the design of our web platform - both mobile and desktop.\\nYou\\u2019ll collaborate with our UX designers to build great user experiences. You will work closely with talented engineers to build a scalable, resilient, and well-designed applications. You\\u2019ll own, architect, and expand key pieces of our platform, mentor junior engineers, and have a huge impact on the product from conception to launch.<p>You will:\\nLead the design and development of next generation content management and publishing platform\\nOwn the front-end of our web platform across desktop and mobile\\nCollaborate with UX Designers, developers and internal stakeholders to build innovative and scalable features\\nMentor members of the team\\nContinuously develop expertise and knowledge on best practices and latest technology trends in software development<p>This describes you:\\nPassionate about designing innovative and functional web applications\\nYou enjoy collaborating on difficult to solve projects and you take pride in creating great software\\nYou sleep and breathe JavaScript, HTML and CSS\\nYou care about performance, scalability, maintainability and the end user\\nParticipate in open source projects \\u2013 we would love to see your GitHub projects\\nYou have sense of humor and you check your ego at the door<p>Qualifications:\\nVery strong web fundamentals, including experience with JavaScript frameworks and REST based API concepts and techniques<p>We would like you to have:\\nFamiliarity with Social Networks APIs and Authentication models, data storage and retrieval methods<p>The Package:\\nGenerous 
Incentives\\nCompetitive compensation\\nPaid vacation\\nTremendous growth opportunity\\nLocation: Houston, TX<p>Contact: Sebastian\\nsl[at}in[Em dash]acuity.com'), Row(title=u'Landing.js: JavaScript library for create simple landing page with bing api', text=u'Landing.js: Javascript library for create simple landing page with bing api<p>Demo1: http:&#x2F;&#x2F;vah7id.com&#x2F;project&#x2F;Landingjs&#x2F;demo1\\nDemo2: http:&#x2F;&#x2F;vah7id.com&#x2F;project&#x2F;Landingjs&#x2F;demo2')]\n"
     ]
    }
   ],
   "source": [
    "print('Running query...')\n",
    "# The query uses legacy SQL syntax ([project:dataset.table], contains), so the\n",
    "# dialect is requested explicitly instead of relying on the library default.\n",
    "df = gbq.read_gbq(\"\"\"\n",
    "SELECT title, text \n",
    "FROM [bigquery-public-data:hacker_news.stories] \n",
    "where text > \" \" and title contains(\"JavaScript\") \n",
    "LIMIT 10\n",
    "\"\"\", project_id=PROJECT_ID, dialect='legacy')\n",
    "\n",
    "#Convert Pandas DataFrame to RDD\n",
    "rdd = sqlContext.createDataFrame(df).rdd\n",
    "\n",
    "print(rdd.take(2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score:0.5 and Magnitde:3.2\n",
      "Score:0.2 and Magnitde:0.2\n",
      "Score:0.2 and Magnitde:0.5\n",
      "Score:0.2 and Magnitde:2.1\n",
      "Score:-0.7 and Magnitde:0.7\n",
      "Score:0.3 and Magnitde:0.6\n",
      "Score:0.2 and Magnitde:1.5\n",
      "Score:-0.5 and Magnitde:0.5\n",
      "Score:0.1 and Magnitde:1\n",
      "Score:0.1 and Magnitde:1.9\n"
     ]
    }
   ],
   "source": [
    "# Pull the text column (position 1, after title) out of each row\n",
    "comments = rdd.map(lambda row: row[1])\n",
    "sentiments = comments.map(executeSentimentAnalysis)\n",
    "\n",
    "for result in sentiments.collect():\n",
    "    doc_sentiment = result['documentSentiment']\n",
    "    print(\"Score:{0} and Magnitude:{1}\".format(doc_sentiment['score'], doc_sentiment['magnitude']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: The function will return a score and magnitude for each string passed.  A score greater than zero is considered positive, and a score less than zero is negative.  Magnitude is a measure of how strong the sentiment is."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
